外部套件下載及應用

林嶔 (Lin, Chin)

Lesson 7

第一節:快速讀寫檔案(1)

– 不同的主因其實牽涉到程式碼執行過程,但目前我們的知識很有限,做出來就不容易了更別說還要求速度快,因此比較可行的方式是去「抄」程式碼!

第一節:快速讀寫檔案(2)

– 至於安裝套件的方法,假設你是用Rstudio,可以看到右下角有個Packages分頁,點選後你能看到Install按鍵,透過這種方式就能安裝套件了

F7_1

install.packages("data.table")
library(data.table)

第一節:快速讀寫檔案(3)

– 至於大Data在哪,我們可以使用第五節課用到的大檔案,請在這裡下載。

t0 = Sys.time()
dat1 = read.csv('data3_4.csv', header = TRUE, fileEncoding = 'CP950')
Sys.time() - t0
## Time difference of 1.562384 secs
t0 = Sys.time()
dat2 = fread('data3_4.csv', header = TRUE)
Sys.time() - t0
## Time difference of 0.07405019 secs

第一節:快速讀寫檔案(4)

class(dat1)
## [1] "data.frame"
class(dat2)
## [1] "data.table" "data.frame"
t0 = Sys.time()
dat2 = fread('data3_4.csv', header = TRUE, data.table = FALSE)
Sys.time() - t0
## Time difference of 0.1066926 secs
class(dat2)
## [1] "data.frame"

第一節:快速讀寫檔案(5)

all.equal(dat1, dat2)
## [1] "Component \"COLLECTIONDATE\": 'current' is not a factor"
## [2] "Component \"TESTNAME\": 'current' is not a factor"      
## [3] "Component \"UNITS\": 'current' is not a factor"
dat1 = read.csv('data3_4.csv', header = TRUE, stringsAsFactors = FALSE, fileEncoding = 'CP950')
all.equal(dat1, dat2)
## [1] "Component \"COLLECTIONDATE\": 181483 string mismatches"

第一節:快速讀寫檔案(6)

– 這是使用「write.csv」寫出的速度:

t0 = Sys.time()
write.csv(dat1, 'data3_4(1).csv', row.names = FALSE, quote = TRUE)
Sys.time() - t0
## Time difference of 0.9300547 secs

– 這是使用「fwrite」寫出的速度:

t0 = Sys.time()
fwrite(dat2, 'data3_4(2).csv', row.names = FALSE, quote = TRUE)
Sys.time() - t0
## Time difference of 0.04712176 secs

第一節:快速讀寫檔案(7)

– 這是儲存

t0 = Sys.time()
save(dat1, file = 'data3_4.RData')
Sys.time() - t0
## Time difference of 0.6017618 secs

– 這是載入

t0 = Sys.time()
load('data3_4.RData')
Sys.time() - t0
## Time difference of 0.2020478 secs

練習1:讀取圖片及顯示圖片

– 透過Google搜尋「R display image」後,你將可以找到這個頁面,其中第一個連結進去後你會發現這裡已經有人發問和回答了:

F7_2

練習1答案

library("jpeg")
img <- readJPEG(system.file("img", "Rlogo.jpg", package = "jpeg"))
plot(0:1, 0:1, type = "n", ann = FALSE, axes = FALSE)
rasterImage(img, -0.04, -0.04, 1.04, 1.04)

第二節:增加程式可讀性(1)

length(levels(factor(dat1$TESTNAME)))
## [1] 25
factorized_TESTNAME = factor(dat1$TESTNAME)
lvl_TESTNAME = levels(factorized_TESTNAME)
length(lvl_TESTNAME)
## [1] 25

第二節:增加程式可讀性(2)

– 後面函數的「.」代表上一步的結果

library(magrittr)

n.TESTNAME = dat1$TESTNAME %>% factor %>% levels %>% length
n.TESTNAME
## [1] 25
n.TESTNAME = dat1$TESTNAME %>% factor() %>% levels() %>% length()
n.TESTNAME
## [1] 25
n.TESTNAME = dat1$TESTNAME %>% factor(.) %>% levels(.) %>% length(.)
n.TESTNAME
## [1] 25
f = function(x, a, b) {a*x^2 + b}
1:5 %>% f(., 2, 5)
## [1]  7 13 23 37 55
1:5 %>% f(2, ., 5)
## [1]  9 13 17 21 25
1:5 %>% f(2, 5, .)
## [1] 21 22 23 24 25

第二節:增加程式可讀性(3)

– 「%<>%」:不要顯示結果,而是改變物件內容

a = 1
a %<>% add(1)
a
## [1] 2

– 「%$%」:指定物件內的索引格式

n.TESTNAME = dat1 %$% TESTNAME %>% factor %>% levels %>% length
n.TESTNAME
## [1] 25

第二節:增加程式可讀性(4)

t0 = Sys.time()

dat1$COLLECTIONDATE = dat1[,3] %>% as.Date

levels.TESTNAME = dat1[,4] %>% factor %>% levels
n.TESTNAME = levels.TESTNAME %>% length
levels.PATNUMBER = dat1[,1] %>% factor %>% levels
n.PATNUMBER = levels.PATNUMBER %>% length

dat_list = list()

for (i in 1:n.PATNUMBER) {
  
  subdat = dat1[dat1[,1]==levels.PATNUMBER[i],]
  levels.COLLECTIONDATE = subdat[,3] %>% factor %>% levels
  n.COLLECTIONDATE = levels.COLLECTIONDATE %>% length

  submatrix = matrix(NA, nrow = n.COLLECTIONDATE, ncol = n.TESTNAME + 2)
  colnames(submatrix) = c("PATNUMBER", "COLLECTIONDATE", levels.TESTNAME)

  submatrix[,1] = levels.PATNUMBER[i]
  submatrix[,2] = levels.COLLECTIONDATE

  for (j in 1:n.COLLECTIONDATE) {
    subsubdat = subdat[subdat[,3]==levels.COLLECTIONDATE[j],]
    for (k in 1:nrow(subsubdat)) {
      NAME = subsubdat[k,4]
      position = which(NAME == levels.TESTNAME) + 2
      submatrix[j, position] = subsubdat[k,5]
    }
  }
  
  dat_list[[i]] = submatrix
}

final.data = do.call("rbind", dat_list)

Sys.time() - t0
## Time difference of 1.55415 mins
head(final.data)
##      PATNUMBER COLLECTIONDATE Albumin Albumin body fluid AST BUN BUN Fluid
## [1,] "26"      "2011-05-12"   NA      NA                 NA  NA  NA       
## [2,] "26"      "2011-05-30"   NA      NA                 NA  NA  NA       
## [3,] "26"      "2011-05-31"   NA      NA                 NA  NA  NA       
## [4,] "26"      "2011-06-01"   NA      NA                 NA  NA  NA       
## [5,] "26"      "2011-06-02"   NA      NA                 NA  NA  NA       
## [6,] "26"      "2011-06-06"   NA      NA                 NA  NA  NA       
##      Cholesterol Fluid Creatinine Creatinine Fluid GLU(AC) HDL-Cholesterol
## [1,] NA                "1.8"      NA               NA      NA             
## [2,] NA                "3"        NA               NA      NA             
## [3,] NA                "2.9"      NA               NA      NA             
## [4,] NA                "2.9"      NA               NA      NA             
## [5,] NA                "2.4"      NA               NA      NA             
## [6,] NA                "1.9"      NA               NA      NA             
##      IP    K  LDL-Cholesterol Na    NA Fluid Total Calcium
## [1,] NA    NA NA              "140" NA       NA           
## [2,] NA    NA NA              "139" NA       NA           
## [3,] "4.6" NA NA              "145" NA       "7.8"        
## [4,] NA    NA NA              "144" NA       "6.1"        
## [5,] NA    NA NA              "138" NA       "7.1"        
## [6,] NA    NA NA              "134" NA       "8"          
##      Total Cholesterol Triglyceride Triglycerol Fluid Uric Acid
## [1,] NA                NA           NA                NA       
## [2,] NA                NA           NA                NA       
## [3,] "134"             "131"        NA                NA       
## [4,] NA                NA           NA                NA       
## [5,] NA                NA           NA                NA       
## [6,] NA                NA           NA                NA       
##      urine Calcium urine Phosphorus urine Potassium urine Sodium
## [1,] NA            NA               NA              NA          
## [2,] NA            NA               NA              NA          
## [3,] "5.6"         "41.4"           "29"            "61"        
## [4,] NA            NA               NA              NA          
## [5,] NA            NA               NA              NA          
## [6,] NA            NA               NA              NA          
##      urine Uric Acid
## [1,] NA             
## [2,] NA             
## [3,] NA             
## [4,] NA             
## [5,] NA             
## [6,] NA
final.data = as.data.frame(final.data, stringsAsFactors = FALSE)

fwrite(final.data, 'final_data.csv', row.names = FALSE, quote = TRUE)

練習2:學習看看說明做事情

for (i in 1:n.TESTNAME) {
  final.data[,i+2] = as.numeric(final.data[,i+2])
}

– 假設我們想做出鈣磷乘積:

final.data_1 = final.data

final.data_1[,"Cap"] = final.data_1[,"Total Calcium"] * final.data_1[,"IP"]

head(final.data_1)
##   PATNUMBER COLLECTIONDATE Albumin Albumin body fluid AST BUN BUN Fluid
## 1        26     2011-05-12      NA                 NA  NA  NA        NA
## 2        26     2011-05-30      NA                 NA  NA  NA        NA
## 3        26     2011-05-31      NA                 NA  NA  NA        NA
## 4        26     2011-06-01      NA                 NA  NA  NA        NA
## 5        26     2011-06-02      NA                 NA  NA  NA        NA
## 6        26     2011-06-06      NA                 NA  NA  NA        NA
##   Cholesterol Fluid Creatinine Creatinine Fluid GLU(AC) HDL-Cholesterol
## 1                NA        1.8               NA      NA              NA
## 2                NA        3.0               NA      NA              NA
## 3                NA        2.9               NA      NA              NA
## 4                NA        2.9               NA      NA              NA
## 5                NA        2.4               NA      NA              NA
## 6                NA        1.9               NA      NA              NA
##    IP  K LDL-Cholesterol  Na NA Fluid Total Calcium Total Cholesterol
## 1  NA NA              NA 140       NA            NA                NA
## 2  NA NA              NA 139       NA            NA                NA
## 3 4.6 NA              NA 145       NA           7.8               134
## 4  NA NA              NA 144       NA           6.1                NA
## 5  NA NA              NA 138       NA           7.1                NA
## 6  NA NA              NA 134       NA           8.0                NA
##   Triglyceride Triglycerol Fluid Uric Acid urine Calcium urine Phosphorus
## 1           NA                NA        NA            NA               NA
## 2           NA                NA        NA            NA               NA
## 3          131                NA        NA           5.6             41.4
## 4           NA                NA        NA            NA               NA
## 5           NA                NA        NA            NA               NA
## 6           NA                NA        NA            NA               NA
##   urine Potassium urine Sodium urine Uric Acid   Cap
## 1              NA           NA              NA    NA
## 2              NA           NA              NA    NA
## 3              29           61              NA 35.88
## 4              NA           NA              NA    NA
## 5              NA           NA              NA    NA
## 6              NA           NA              NA    NA

– 在仔細閱讀說明後,我們發現有個「mutate」函數,他能做到我們想做的事情,現在請你試著使用mutate函數來做出鈣磷乘積!

練習2答案

library(dplyr)

final.data_1 = final.data

final.data_1 %<>% mutate(CrNa = Creatinine * IP)
colnames(final.data_1)[18] = 'Total_Calcium'

final.data_1 %<>% mutate(Cap = Total_Calcium * Na)

第三節:遺漏值插補(1)

final.data_1 = final.data

levels.PATNUMBER = final.data_1[,1] %>% factor %>% levels
n.PATNUMBER = levels.PATNUMBER %>% length

i = 1
sub_final.data_1 = final.data_1[final.data_1[,1] == levels.PATNUMBER[i],]
sub_final.data_1[is.na(sub_final.data_1[,'IP']),'IP'] = mean(sub_final.data_1[,'IP'], na.rm = TRUE)

第三節:遺漏值插補(2)

final.data_1 = final.data

levels.PATNUMBER = final.data_1[,1] %>% factor %>% levels
n.PATNUMBER = levels.PATNUMBER %>% length
levels.TESTNAME = colnames(final.data_1)[-c(1:2)]
n.TESTNAME = levels.TESTNAME %>% length

dat_list = list()

for (i in 1:n.PATNUMBER) {
  sub_final.data_1 = final.data_1[final.data_1[,1] == levels.PATNUMBER[i],]
  for (j in 1:n.TESTNAME) {
    sub_final.data_1[is.na(sub_final.data_1[,levels.TESTNAME[j]]),levels.TESTNAME[j]] = mean(sub_final.data_1[,levels.TESTNAME[j]], na.rm = TRUE)
  }
  dat_list[[i]] = sub_final.data_1
}

final.data_1 = do.call("rbind", dat_list)

– 那這樣再考考你,你覺得應該怎麼做?

第三節:遺漏值插補(3)

final.data_1 = final.data

levels.PATNUMBER = final.data_1[,1] %>% factor %>% levels
n.PATNUMBER = levels.PATNUMBER %>% length
levels.TESTNAME = colnames(final.data_1)[-c(1:2)]
n.TESTNAME = levels.TESTNAME %>% length

dat_list = list()

i = 1
sub_final.data_1 = final.data_1[final.data_1[,1] == levels.PATNUMBER[i],]
j = 1
value_pos = which(!is.na(sub_final.data_1[,levels.TESTNAME[j]]))
if (length(value_pos)!=0) {
  k = 1
  if (is.na(sub_final.data_1[k,levels.TESTNAME[j]])) {
    impute_seq = which.min(abs(value_pos - k))
    impute_pos = value_pos[impute_seq]
    sub_final.data_1[k,levels.TESTNAME[j]] = sub_final.data_1[impute_pos,levels.TESTNAME[j]]
  }
}

第三節:遺漏值插補(4)

final.data_1 = final.data

levels.PATNUMBER = final.data_1[,1] %>% factor %>% levels
n.PATNUMBER = levels.PATNUMBER %>% length
levels.TESTNAME = colnames(final.data_1)[-c(1:2)]
n.TESTNAME = levels.TESTNAME %>% length

dat_list = list()

for (i in 1:n.PATNUMBER) {
  sub_final.data_1 = final.data_1[final.data_1[,1] == levels.PATNUMBER[i],]
  for (j in 1:n.TESTNAME) {
    value_pos = which(!is.na(sub_final.data_1[,levels.TESTNAME[j]]))
    if (length(value_pos)!=0) {
      for (k in 1:nrow(sub_final.data_1)) {
        if (is.na(sub_final.data_1[k,levels.TESTNAME[j]])) {
          impute_seq = which.min(abs(value_pos - k))
          impute_pos = value_pos[impute_seq]
          sub_final.data_1[k,levels.TESTNAME[j]] = sub_final.data_1[impute_pos,levels.TESTNAME[j]]
        }
      }
    }
  }
}

final.data_1 = do.call("rbind", dat_list)

期中作業:找出適當的作法及套件進行資料插補

F7_4

  1. 你對目前這份資料型態的了解

  2. 你所選用的方法及其原理

  3. 插補過程的程式碼(含檔案)

  4. 你所引用的資料來源

小結

– 遺漏值插補是資料處理中最有學問的部分,你之後會發現未來在做任何事情的時候其實是不允許遺漏值存在的,而遺漏值插補的過程會嚴重的影響你未來工作的效果!

– 但你可以直接把「輪子」買來,但你也要有能力確保若廠商不供貨時,你也可以靠自己把「輪子」製造出來!